last changed: 2023-08-29
At first, the correlation matrices are required and for simplicity the peak positions are changed to gene symbols.
> #H3K27ac:
> m.H3.1ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_H3.1K27ac_median.rds")
> head(m.H3.1ac)
## T0h T1h T2h T4h
## chr1:181453-181717 3.967340 3.759846 3.836270 3.779723
## chr1:777858-779502 6.843831 6.703328 6.579330 6.871074
## chr1:827213-827667 4.894720 4.822416 5.073162 4.769361
## chr1:903806-905645 7.300799 7.406745 7.178423 7.253228
## chr1:920444-922078 5.087611 6.262964 5.091975 4.968098
## chr1:923270-926516 8.334292 8.984147 8.254235 8.021373
> m.H3.3ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_H3.3K27ac_median.rds")
> head(m.H3.3ac)
## T0h T1h T2h T4h
## chr1:181453-181717 4.338979 4.475285 4.704645 4.461679
## chr1:777858-779502 7.191532 7.140648 6.606008 6.555047
## chr1:827213-827667 4.715587 4.437999 4.523847 4.657809
## chr1:903806-905645 7.245065 7.791045 7.397244 7.279499
## chr1:920444-922078 5.713139 6.483811 5.428020 5.093662
## chr1:923270-926516 8.631627 9.509342 8.500228 8.103128
> peaks.H3ac <- readRDS("./../Compare_H3K27ac_RNAseq/Data/H3K27ac_coding_peaks_1kb.rds") %>%
+ as.data.frame()
> peaks.H3ac <- mutate(peaks.H3ac, peak_pos = paste0(peaks.H3ac$chr, ":", peaks.H3ac$start, "-", peaks.H3ac$end))
> peaks.H3ac <- peaks.H3ac[order(peaks.H3ac$distanceToTSS %>% as.integer() %>% abs(), decreasing = F), ]
> peaks.H3ac <- peaks.H3ac[!duplicated(peaks.H3ac$gene_symbol),]
> peaks.H3ac <- peaks.H3ac[!is.na(peaks.H3ac$gene_symbol), ]
> rownames(peaks.H3ac) <- peaks.H3ac$gene_symbol
> paged_table(peaks.H3ac)
> #RNAseq:
> m.RNAseq <- readRDS("./../Compare_H3K27ac_RNAseq/Data/m_RNAseq_median.rds")
> head(m.RNAseq)
## T0h T1h T2h T4h
## ENSG00000187961 9.472383 9.373587 9.318045 9.252606
## ENSG00000187583 6.887230 6.861752 6.803387 6.664795
## ENSG00000272512 3.451118 3.485096 3.553123 3.609632
## ENSG00000157933 10.105359 10.144625 10.408214 10.423799
## ENSG00000158286 6.990695 6.825305 6.717835 6.642559
## ENSG00000049249 6.356913 6.404449 6.662290 6.626253
The enrichment analysis was performed with Metascape. Here, a list with all genes for each cluster is required which was uploaded and used to perform the enrichment by cluster (NAs need to be excluded).
> clus.genes <- readRDS("./../Enrichment_analysis/Data/RNAseq_genes_cluster.rds")
> lapply(clus.genes, head)
## $`1`
## [1] "ENSG00000116685" "ENSG00000186501" "ENSG00000130772" "ENSG00000162419"
## [5] "ENSG00000196182" "ENSG00000066135"
##
## $`2`
## [1] "ENSG00000187583" "ENSG00000142798" "ENSG00000133216" "ENSG00000088280"
## [5] "ENSG00000158246" "ENSG00000142733"
##
## $`3`
## [1] "ENSG00000187961" "ENSG00000158286" "ENSG00000162496" "ENSG00000160094"
## [5] "ENSG00000142687" "ENSG00000179862"
##
## $`4`
## [1] "ENSG00000272512" "ENSG00000157933" "ENSG00000049249" "ENSG00000171608"
## [5] "ENSG00000197312" "ENSG00000065526"
> # make data.frame with required structure:
> l1 <- length(clus.genes[[1]])
> l2 <- length(clus.genes[[2]])
> l3 <- length(clus.genes[[3]])
> l4 <- length(clus.genes[[4]])
>
> max.length <- max(l1, l2, l3, l4)
>
> meta.genes <- data.frame(cluster_1 = c(clus.genes[[1]], rep(NA, max.length - l1)),
+ cluster_2 = c(clus.genes[[2]], rep(NA, max.length - l2)),
+ cluster_3 = c(clus.genes[[3]], rep(NA, max.length - l3)),
+ cluster_4 = c(clus.genes[[4]], rep(NA, max.length - l4)))
>
> paged_table(meta.genes)
Metascape enrichment results
Here, the enrichment results are shown as a table:
> meta.data <- read_excel("./Data/metascape_result.xlsx") %>% as.data.frame()
> rownames(meta.data) <- meta.data$original_id
>
> paged_table(meta.data)
> meta.enrich <- read_excel("./Data/metascape_result.xlsx", sheet = 2)
>
> paged_table(meta.enrich)
The top hits of the enrichment were then used to get the enriched genes in the respective pathway. For this the results table of the Metascape enrichment was used.
> genes.hsv.infec <- str_split_1(meta.enrich[2, "Symbols"] %>% as.character(), ",")
>
> genes.cell.resp.stress <- str_split_1(meta.enrich[15, "Symbols"] %>% as.character(), ",")
>
> genes.TNF.sign.pw <- str_split_1(meta.enrich[25, "Symbols"] %>% as.character(), ",")
These gene list were then used to construct a data frame which contains information about if there is a significant H3K27ac peak or not.
> RNAseq.genes <- data.frame(ensembl = meta.data$original_id, symbol = meta.data$`Gene Symbol`)
> rownames(RNAseq.genes) <- RNAseq.genes$symbol
>
> corr.peaks <- ifelse(RNAseq.genes$symbol == peaks.H3ac[RNAseq.genes$symbol, "gene_symbol"],
+ peaks.H3ac[RNAseq.genes$symbol, "peak_pos"],
+ NA)
>
> corr.T.F <- ifelse(is.na(corr.peaks), "No", "Yes")
>
> RNAseq.genes <- mutate(RNAseq.genes, peaks = corr.peaks, corr_peak = corr.T.F %>% as.factor)
>
> pw.m.RNAseq <- rbind(RNAseq.genes[genes.hsv.infec ,],
+ RNAseq.genes[genes.cell.resp.stress ,],
+ RNAseq.genes[genes.TNF.sign.pw ,])
>
> pw <- c(rep("Herpes simplex virus 1 infection", length(genes.hsv.infec)),
+ rep("Regulation of cellular response to stress", length(genes.cell.resp.stress)),
+ rep("TNF signaling pathway", length(genes.TNF.sign.pw)))
>
> pw.m.RNAseq <- mutate(pw.m.RNAseq, pathway = pw)
>
> paged_table(pw.m.RNAseq)
The results were then plotted as follows:
> ggplot(pw.m.RNAseq) +
+ geom_bar(aes(y = pathway, fill = corr_peak), position = "fill", orientation = "y") +
+ labs(y = "", fill = "H3K27ac peak") +
+ theme_bw()
These pathways were then used to calculate the correlation of the respective gene expression with the H3K27ac.
> corr.H3.1 <- corr.H3.3 <- list(HSV = NA, CRS = NA, TNF = NA)
>
> list.pw <- list(genes.hsv.infec, genes.cell.resp.stress, genes.TNF.sign.pw)
>
> for (pw in 1:3) {
+ corr.all.H3.1 <- corr.all.H3.3 <- c()
+ curr.genes <- list.pw[[pw]]
+ curr.ensembl <- pw.m.RNAseq[curr.genes, "ensembl"]
+ curr.peaks <- pw.m.RNAseq[curr.genes, "peaks"]
+ for (i in 1:length(list.pw[[pw]])) {
+ if (is.na(pw.m.RNAseq[curr.genes[i], "peaks"]) == T) {
+ curr.corr.H3.1 <- NA
+ curr.corr.H3.3 <- NA
+ }
+ else {
+ curr.corr.H3.1 <- cor(m.H3.1ac[curr.peaks[i], ], m.RNAseq[curr.ensembl[i], ], method = "pearson")
+ curr.corr.H3.3 <- cor(m.H3.3ac[curr.peaks[i], ], m.RNAseq[curr.ensembl[i], ], method = "pearson")
+ }
+ corr.all.H3.1 <- c(corr.all.H3.1, curr.corr.H3.1)
+ corr.all.H3.3 <- c(corr.all.H3.3, curr.corr.H3.3)
+ }
+ names(corr.all.H3.1) <- names(corr.all.H3.3) <- curr.genes
+ corr.H3.1[[pw]] <- corr.all.H3.1
+ corr.H3.3[[pw]] <- corr.all.H3.3
+ }
Next, a data frame was constructed containing all the required information to plot correlation results. These data was again annotated to the respective construct (H3.1 or H3.3).
> # construct dataframe for each of the constructs
> pw.corr.RNAseq <- mutate(rbind(pw.m.RNAseq, pw.m.RNAseq),
+ R = c(corr.H3.1[["HSV"]], corr.H3.1[["CRS"]], corr.H3.1[["TNF"]],
+ corr.H3.3[["HSV"]], corr.H3.3[["CRS"]], corr.H3.3[["TNF"]]),
+ experiment = rep(c("H3.1K27ac", "H3.3K27ac"), each = nrow(pw.m.RNAseq)) %>% as.factor(),
+ info.corr = NA)
>
> pw.corr.RNAseq$symbol <- pw.corr.RNAseq$symbol %>% as.factor
> pw.corr.RNAseq$pathway <- pw.corr.RNAseq$pathway %>% as.factor
>
> # add factor for corr:
> pw.corr.RNAseq[which(pw.corr.RNAseq$corr_peak == "No"), "info.corr"] <- "no peak"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R %>% abs() <= 0.7), "info.corr"] <- "-0.7 < R < 0.7"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R <= -0.7), "info.corr"] <- "R <= -0.7"
> pw.corr.RNAseq[which(pw.corr.RNAseq$R >= 0.7), "info.corr"] <- "R >= 0.7"
> pw.corr.RNAseq$info.corr <- pw.corr.RNAseq$info.corr %>% as.factor()
>
> paged_table(pw.corr.RNAseq)
To plot the data as a pie chart the counts of the certain events are required, which were calculated as follows:
> pw.corr.H3.1 <- pw.corr.RNAseq[which(pw.corr.RNAseq$experiment == "H3.1K27ac"), ]
> pw.corr.H3.3 <- pw.corr.RNAseq[which(pw.corr.RNAseq$experiment == "H3.3K27ac"), ]
>
> pie.RNAseq <- data.frame(count = c(
+ length(which(pw.corr.H3.1$info.corr == "no peak")),
+ length(which(pw.corr.H3.1$info.corr == "-0.7 < R < 0.7")),
+ length(which(pw.corr.H3.1$info.corr == "R <= -0.7")),
+ length(which(pw.corr.H3.1$info.corr == "R >= 0.7")),
+ length(which(pw.corr.H3.3$info.corr == "no peak")),
+ length(which(pw.corr.H3.3$info.corr == "-0.7 < R < 0.7")),
+ length(which(pw.corr.H3.3$info.corr == "R <= -0.7")),
+ length(which(pw.corr.H3.3$info.corr == "R >= 0.7"))),
+ group = rep(c("no peak", "-0.7 < R < 0.7", "R <= -0.7", "R >= 0.7"), 1),
+ experiment = rep(c("H3.1K27ac", "H3.3K27ac"), each = 4))
>
> paged_table(pie.RNAseq)
The results were then plotted as a pie chart.
> ggplot(pie.RNAseq) +
+ geom_bar(aes(x = "", y = count, fill = group),
+ stat = "identity",
+ position = "fill",
+ width = 1,
+ color = "white") +
+ coord_polar("y", start = 0) +
+ labs(x = "", y = "",
+ title = "Correlation between RNAseq and H3K27ac",
+ fill = "Correlation") +
+ facet_wrap(~ experiment) +
+ theme_minimal()
To show the correlation of each gene in the analyzed pathways, only those genes with a correlation of R >= 0.7 or <= -0.7 were used. These were selected and ordered as follows:
> # filter for R >= 0.7 / <= -0.7:
> pw.corr0.7 <- which(pw.corr.H3.1$R %>% abs() >= 0.7 | pw.corr.H3.3$R %>% abs() >= 0.7)
>
> pw.corr0.7.H3.1 <- pw.corr.H3.1[pw.corr0.7 ,]
> pw.corr0.7.H3.3 <- pw.corr.H3.3[pw.corr0.7 ,]
>
> pw.corr.0.7.RNAseq <- rbind(pw.corr0.7.H3.1, pw.corr0.7.H3.3)
>
> #seperate data by pathway:
> hsv.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "Herpes simplex virus 1 infection"), ]
> rcs.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "Regulation of cellular response to stress"), ]
> tnf.3.1 <- pw.corr0.7.H3.1[which(pw.corr0.7.H3.1$pathway == "TNF signaling pathway"), ]
> hsv.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "Herpes simplex virus 1 infection"), ]
> rcs.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "Regulation of cellular response to stress"), ]
> tnf.3.3 <- pw.corr0.7.H3.3[which(pw.corr0.7.H3.3$pathway == "TNF signaling pathway"), ]
>
> corr.hsv <- rbind(hsv.3.1, hsv.3.3)
> corr.rcs <- rbind(rcs.3.1, rcs.3.3)
> corr.tnf <- rbind(tnf.3.1, tnf.3.3)
>
> # order factors levels by correlation:
> hsv.sum <- hsv.3.1$R + hsv.3.3$R
> rcs.sum <- rcs.3.1$R + rcs.3.3$R
> tnf.sum <- tnf.3.1$R + tnf.3.3$R
>
> corr.hsv <- mutate(corr.hsv, symbol = fct_relevel(symbol, hsv.sum %>% sort %>% names))
> corr.rcs <- mutate(corr.rcs, symbol = fct_relevel(symbol, rcs.sum %>% sort %>% names))
> corr.tnf <- mutate(corr.tnf, symbol = fct_relevel(symbol, tnf.sum %>% sort %>% names))
The correlation of genes with histone acetylation were then plotted for each pathway separately.
> ggplot() +
+ geom_col(data = corr.hsv, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+ facet_wrap(~ pathway) +
+ labs(x = "gene") +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+ scale_fill_manual(values = c("#FFCC33", "#CC3366"))
> ggplot() +
+ geom_col(data = corr.rcs, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+ facet_wrap(~ pathway) +
+ labs(x = "gene") +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+ scale_fill_manual(values = c("#FFCC33", "#CC3366"))
> ggplot() +
+ geom_col(data = corr.tnf, aes(x = symbol, y = R, fill = experiment), position = "dodge") +
+ facet_wrap(~ pathway) +
+ labs(x = "gene") +
+ theme_bw() +
+ theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+ scale_fill_manual(values = c("#FFCC33", "#CC3366"))
The following versions of R and R packages were used to generate the report above:
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_1.0.0 stringr_1.5.0
## [3] readxl_1.4.2 DESeq2_1.40.1
## [5] SummarizedExperiment_1.30.2 MatrixGenerics_1.12.2
## [7] matrixStats_1.0.0 ggplot2_3.4.2
## [9] dplyr_1.1.2 GenomicFeatures_1.52.0
## [11] AnnotationDbi_1.62.1 Biobase_2.60.0
## [13] GenomicRanges_1.52.0 GenomeInfoDb_1.36.0
## [15] IRanges_2.34.0 S4Vectors_0.38.1
## [17] BiocGenerics_0.46.0 report_0.5.7
## [19] rmarkdown_2.23 knitr_1.43
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.0 farver_2.1.1 blob_1.2.4
## [4] filelock_1.0.2 Biostrings_2.68.1 bitops_1.0-7
## [7] fastmap_1.1.1 RCurl_1.98-1.12 BiocFileCache_2.8.0
## [10] GenomicAlignments_1.36.0 XML_3.99-0.14 digest_0.6.31
## [13] lifecycle_1.0.3 KEGGREST_1.40.0 RSQLite_2.3.1
## [16] magrittr_2.0.3 compiler_4.3.0 rlang_1.1.1
## [19] sass_0.4.6 progress_1.2.2 tools_4.3.0
## [22] utf8_1.2.3 yaml_2.3.7 rtracklayer_1.60.0
## [25] labeling_0.4.2 prettyunits_1.1.1 S4Arrays_1.0.4
## [28] bit_4.0.5 curl_5.0.1 DelayedArray_0.26.3
## [31] xml2_1.3.4 BiocParallel_1.34.2 withr_2.5.0
## [34] grid_4.3.0 fansi_1.0.4 colorspace_2.1-0
## [37] scales_1.2.1 biomaRt_2.56.1 insight_0.19.3
## [40] cli_3.6.1 crayon_1.5.2 generics_0.1.3
## [43] rstudioapi_0.14 httr_1.4.6 rjson_0.2.21
## [46] DBI_1.1.3 cachem_1.0.8 zlibbioc_1.46.0
## [49] parallel_4.3.0 cellranger_1.1.0 XVector_0.40.0
## [52] restfulr_0.0.15 vctrs_0.6.3 Matrix_1.5-4.1
## [55] jsonlite_1.8.5 hms_1.1.3 bit64_4.0.5
## [58] locfit_1.5-9.8 jquerylib_0.1.4 glue_1.6.2
## [61] codetools_0.2-19 stringi_1.7.12 gtable_0.3.3
## [64] BiocIO_1.10.0 munsell_0.5.0 tibble_3.2.1
## [67] pillar_1.9.0 rappdirs_0.3.3 htmltools_0.5.5
## [70] GenomeInfoDbData_1.2.10 R6_2.5.1 dbplyr_2.3.2
## [73] evaluate_0.21 lattice_0.21-8 highr_0.10
## [76] png_0.1-8 Rsamtools_2.16.0 memoise_2.0.1
## [79] bslib_0.5.0 Rcpp_1.0.10 xfun_0.39
## [82] pkgconfig_2.0.3